R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.

# get data for hosp_5_years
outcome <- "hosp_5_year"
load(paste("data", outcome, "train_data.Rda", sep="/"))
load(paste("data", outcome, "test_data.Rda", sep="/"))
load(paste("data", outcome, "train_means.Rda", sep="/"))
load(paste("data", outcome, "train_sds.Rda", sep="/"))
# Get only required columns

preds_cts = c(
  "decades_from_start", # This is for imputing and including time (hospital effects)
  "age",
  "age_2", 
  "height",
  "weight",
  "bmi", 
  "fev1",
  "fvc", 
  "fivc",
  "fev1_fvc",
  "fev1_fev6",
  "pef", 
  "mmef",
  "dlco",
  "spo2"
)
preds_bin = c("hosp_past_year", "sex") # Binary predictors
preds_categ = c("smoking_status", "mmrc") # Categorical
preds_inter = c("sex:age")

train_data <- train_data[c(preds_bin, preds_cts, preds_categ, outcome)]
train_data
## # A tibble: 4,544 × 20
##    hosp_past_year sex   decades_from_start     age  age_2 height   weight     bmi
##    <lgl>          <fct>              <dbl>   <dbl>  <dbl>  <dbl>    <dbl>   <dbl>
##  1 FALSE          1                 0.101  -0.0993 0.0179  0.777  0.166   -0.206 
##  2 FALSE          0                 0.254   0.484  0.425  -0.978 -1.01    -0.802 
##  3 FALSE          1                 0.456  -1.15   2.39    1.15   0.00241 -0.514 
##  4 FALSE          1                -0.0506  0.0809 0.0119 -0.127 -0.161   -0.109 
##  5 FALSE          1                -0.431  -0.260  0.123   0.245  0.0569  -0.0586
##  6 FALSE          1                 0.633   0.372  0.252  -0.606 -0.325   -0.0489
##  7 FALSE          1                 0.354  -0.199  0.0719  0.352  0.520    0.376 
##  8 FALSE          1                 0.786  -0.299  0.162   1.10   0.166   -0.346 
##  9 FALSE          0                 0.583   0.154  0.0429 -0.180 -0.842   -0.890 
## 10 FALSE          1                -0.0248 -0.767  1.07    0.139 -0.188   -0.274 
## # ℹ 4,534 more rows
## # ℹ 12 more variables: fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## #   fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## #   smoking_status <fct>, mmrc <fct>, hosp_5_year <lgl>
library(mice)

nimp <- 10
pred_mat <- make.predictorMatrix(train_data)
# pred_mat[c("height", "weight"), c("bmi", "dlco")] <- 0
meth <- make.method(train_data)
imp <- futuremice(train_data, n.core=5,
      predictorMatrix = pred_mat, method = meth,
      m = nimp, maxit = 20, print = FALSE
  )
imp$loggedEvents
## NULL
plot(imp)

Imputation here shows height, weight, and bmi not being properly mixed together. The other variables are all sufficiently mixed.

# Don't apply passive imputation since BMI is no longer maintaining its ratio to weight and height

train_data %>% 
  select(weight, height, bmi) %>%
  mutate(bmi_calc = weight / height^2)
## # A tibble: 4,544 × 4
##      weight height     bmi  bmi_calc
##       <dbl>  <dbl>   <dbl>     <dbl>
##  1  0.166    0.777 -0.206    0.275  
##  2 -1.01    -0.978 -0.802   -1.05   
##  3  0.00241  1.15  -0.514    0.00183
##  4 -0.161   -0.127 -0.109   -9.97   
##  5  0.0569   0.245 -0.0586   0.946  
##  6 -0.325   -0.606 -0.0489  -0.884  
##  7  0.520    0.352  0.376    4.21   
##  8  0.166    1.10  -0.346    0.138  
##  9 -0.842   -0.180 -0.890  -25.9    
## 10 -0.188    0.139 -0.274   -9.77   
## # ℹ 4,534 more rows
# However, there is a strong correlation
library(naniar)
library(ggplot2)
ggplot(train_data, 
       aes(x=weight,
           y=bmi)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=bmi)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=weight,
           y=height)) +
  geom_miss_point(alpha=0.1)

There is a strong correlation between the standardised weight and bmi. Relations with height and bmi is hard to see (1/height^2 prop BMI). Weak correlation between height and weight. Therefore we need to remove feedback loop between weight and bmi. Also remove feedback between height and bmi.

pred_mat[c("weight", "height"), "bmi"] <- 0
imp.pas <- futuremice(train_data, n.core=5,
      predictorMatrix = pred_mat, method = meth,
      m = 10, maxit = 20, print = FALSE
  )
plot(imp.pas)

Weight, height and BMI now have better mixing in their chains. Check quickly the other interactions.

ggplot(train_data, 
       aes(x=height,
           y=age)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=fev1)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=fvc)) +
  geom_miss_point(alpha=0.01)

ggplot(train_data, 
       aes(x=height,
           y=fev1_fvc)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=fev1_fev6)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=pef)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=height,
           y=mmef)) +
  geom_miss_point(alpha=0.1)

ggplot(train_data, 
       aes(x=fev1,
           y=fvc)) +
  geom_miss_point(alpha=0.01)

# Look at shadows in distributions
data_shadow <- bind_shadow(train_data)
data_shadow
## # A tibble: 4,544 × 40
##    hosp_past_year sex   decades_from_start     age  age_2 height   weight     bmi
##    <lgl>          <fct>              <dbl>   <dbl>  <dbl>  <dbl>    <dbl>   <dbl>
##  1 FALSE          1                 0.101  -0.0993 0.0179  0.777  0.166   -0.206 
##  2 FALSE          0                 0.254   0.484  0.425  -0.978 -1.01    -0.802 
##  3 FALSE          1                 0.456  -1.15   2.39    1.15   0.00241 -0.514 
##  4 FALSE          1                -0.0506  0.0809 0.0119 -0.127 -0.161   -0.109 
##  5 FALSE          1                -0.431  -0.260  0.123   0.245  0.0569  -0.0586
##  6 FALSE          1                 0.633   0.372  0.252  -0.606 -0.325   -0.0489
##  7 FALSE          1                 0.354  -0.199  0.0719  0.352  0.520    0.376 
##  8 FALSE          1                 0.786  -0.299  0.162   1.10   0.166   -0.346 
##  9 FALSE          0                 0.583   0.154  0.0429 -0.180 -0.842   -0.890 
## 10 FALSE          1                -0.0248 -0.767  1.07    0.139 -0.188   -0.274 
## # ℹ 4,534 more rows
## # ℹ 32 more variables: fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## #   fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## #   smoking_status <fct>, mmrc <fct>, hosp_5_year <lgl>, hosp_past_year_NA <fct>,
## #   sex_NA <fct>, decades_from_start_NA <fct>, age_NA <fct>, age_2_NA <fct>,
## #   height_NA <fct>, weight_NA <fct>, bmi_NA <fct>, fev1_NA <fct>, fvc_NA <fct>,
## #   fivc_NA <fct>, fev1_fvc_NA <fct>, fev1_fev6_NA <fct>, pef_NA <fct>, …

Examine is there a correlation between missing data and not taking certain measurements? Specifically, is age or breathlessness particularly correlated with missingness?

ggplot(data_shadow,
       aes(x=age,
           colour = mmrc_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = mmrc_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = smoking_status_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = smoking_status_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = dlco_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = dlco_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = fivc_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = fivc_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = spo2_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = spo2_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=decades_from_start,
           colour = spo2_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = pef_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = pef_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=age,
           colour = mmef_NA)) +
  geom_density()

ggplot(data_shadow,
       aes(x=fev1_fvc,
           colour = mmef_NA)) +
  geom_density()

For mmrc, smoking status, dlco, and fivc, we see a fairly unbiased relationship between age and fev1_fvc (representative of disease severity). However, the expriatory flow tests (PEF and MMEF) both have different missing data distributions. PEF has a higher proportion of missing values at mid-high FEV1/FVC values, and no missing at low FEV1_FVC. Similarly, PEF has little missingness at older ages, but most of its missing values are for younger patients. Inversely, MMEF is mostly missing its values around slightly below average values for FEV1_FVC, and around the mean age (whereas older or younger patients are having their MMEF measured). HOWEVER, these are both because of their incredibly small pct missingness (random correlations). This can be seen through use of the histogram

ggplot(data_shadow,
       aes(x=fev1_fvc,
           fill = mmef_NA)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We now look at the density plots for the imputed chains.

densityplot(imp.pas)

PEF and MMEF both are noisier, coming from their imputed chains consisting of a very small number of values from which to create a density plot. SPo2 imputations are discrete and plotting their densities is not a good visual representation. The other plots are all good, however. FIVC and DLCO with the most missing values are very tight fits to the original distributions.

We also can note that height, weight, bmi, fivc, pef, and dlco are all easily modelled with gaussians, so we can change the method.

meth_orig <- make.method(train_data)
meth_fast <- meth_orig
meth_fast[c("height", "weight", "bmi", "fivc", "pef", "dlco")] <- "norm"

start.time <- Sys.time()
futuremice(train_data, n.core=5,
      predictorMatrix = pred_mat, method = meth_orig,
      m = nimp, maxit = 20, print = FALSE
  ) 
## Class: mids
## Number of multiple imputations:  10 
## Imputation methods:
##     hosp_past_year                sex decades_from_start                age 
##                 ""                 ""                 ""                 "" 
##              age_2             height             weight                bmi 
##                 ""              "pmm"              "pmm"              "pmm" 
##               fev1                fvc               fivc           fev1_fvc 
##                 ""                 ""              "pmm"                 "" 
##          fev1_fev6                pef               mmef               dlco 
##              "pmm"              "pmm"              "pmm"              "pmm" 
##               spo2     smoking_status               mmrc        hosp_5_year 
##              "pmm"          "polyreg"          "polyreg"                 "" 
## PredictorMatrix:
##                    hosp_past_year sex decades_from_start age age_2 height weight
## hosp_past_year                  0   1                  1   1     1      1      1
## sex                             1   0                  1   1     1      1      1
## decades_from_start              1   1                  0   1     1      1      1
## age                             1   1                  1   0     1      1      1
## age_2                           1   1                  1   1     0      1      1
## height                          1   1                  1   1     1      0      1
##                    bmi fev1 fvc fivc fev1_fvc fev1_fev6 pef mmef dlco spo2
## hosp_past_year       1    1   1    1        1         1   1    1    1    1
## sex                  1    1   1    1        1         1   1    1    1    1
## decades_from_start   1    1   1    1        1         1   1    1    1    1
## age                  1    1   1    1        1         1   1    1    1    1
## age_2                1    1   1    1        1         1   1    1    1    1
## height               0    1   1    1        1         1   1    1    1    1
##                    smoking_status mmrc hosp_5_year
## hosp_past_year                  1    1           1
## sex                             1    1           1
## decades_from_start              1    1           1
## age                             1    1           1
## age_2                           1    1           1
## height                          1    1           1
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## Time difference of 1.112067 mins
start.time <- Sys.time()
futuremice(train_data, n.core=5,
      predictorMatrix = pred_mat, method = meth_fast,
      m = nimp, maxit = 20, print = FALSE
  ) 
## Class: mids
## Number of multiple imputations:  10 
## Imputation methods:
##     hosp_past_year                sex decades_from_start                age 
##                 ""                 ""                 ""                 "" 
##              age_2             height             weight                bmi 
##                 ""             "norm"             "norm"             "norm" 
##               fev1                fvc               fivc           fev1_fvc 
##                 ""                 ""             "norm"                 "" 
##          fev1_fev6                pef               mmef               dlco 
##              "pmm"             "norm"              "pmm"             "norm" 
##               spo2     smoking_status               mmrc        hosp_5_year 
##              "pmm"          "polyreg"          "polyreg"                 "" 
## PredictorMatrix:
##                    hosp_past_year sex decades_from_start age age_2 height weight
## hosp_past_year                  0   1                  1   1     1      1      1
## sex                             1   0                  1   1     1      1      1
## decades_from_start              1   1                  0   1     1      1      1
## age                             1   1                  1   0     1      1      1
## age_2                           1   1                  1   1     0      1      1
## height                          1   1                  1   1     1      0      1
##                    bmi fev1 fvc fivc fev1_fvc fev1_fev6 pef mmef dlco spo2
## hosp_past_year       1    1   1    1        1         1   1    1    1    1
## sex                  1    1   1    1        1         1   1    1    1    1
## decades_from_start   1    1   1    1        1         1   1    1    1    1
## age                  1    1   1    1        1         1   1    1    1    1
## age_2                1    1   1    1        1         1   1    1    1    1
## height               0    1   1    1        1         1   1    1    1    1
##                    smoking_status mmrc hosp_5_year
## hosp_past_year                  1    1           1
## sex                             1    1           1
## decades_from_start              1    1           1
## age                             1    1           1
## age_2                           1    1           1
## height                          1    1           1
end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)
## Time difference of 1.098918 mins

There is no notable speedup, so continue to use pmm.

fit <- with(imp, glm(ici(imp) ~ age + height + weight + bmi + sex + fev1 + fvc + fivc + fev1_fvc + fev1_fev6 + pef + mmef + dlco + spo2 + smoking_status + mmrc, family=binomial))
ps <- rep(rowMeans(sapply(fit$analyses, fitted.values)), imp$m + 1)

# stripplot(imp.pas,data=mmef)

# Use some different methods of plotting spo2 to examine how well it has been imputed.
# SpO2 against probability of data being missing
xyplot(imp.pas, spo2 ~ ps | as.factor(.imp), alpha=0.05)

# SpO2 against severity
xyplot(imp.pas, spo2 ~ fev1_fvc | as.factor(.imp), alpha=0.05)

# Trying to increase distance spread of spo2
xyplot(imp.pas, age~spo2 | as.factor(.imp), na.groups=spo2, cex=c(0.5, 0.1), alpha=0.05)

xyplot(imp.pas, fev1_fvc~spo2 | as.factor(.imp), na.groups=spo2, cex=c(0.5, 0.2), alpha=c(0.05,0.05))

# Box and whiskers plot for each imputation
bwplot(imp.pas, data=spo2)

# Density plot again
densityplot(imp.pas, ~ spo2)

One of the biggest perceived issues is that spo2 has many missing values that get imputed around the lower values for spo2. However, this is just a visualisation issue where it’s hard to measure the relative densities. 80% of spo2 is missing, so it needs lots of imputations.

Overall, all imputations look appropriate.

imp.comp <- complete(imp.pas, "long")
imp.comp
##    .imp .id hosp_past_year sex decades_from_start          age        age_2
## 1     1   1          FALSE   1         0.10103728 -0.099313624 1.790955e-02
## 2     1   2          FALSE   0         0.25352041  0.483518926 4.245158e-01
## 3     1   3          FALSE   1         0.45599801 -1.146111559 2.385178e+00
## 4     1   4          FALSE   1        -0.05061262  0.080904335 1.188531e-02
## 5     1   5          FALSE   1        -0.43140383 -0.260359460 1.230874e-01
## 6     1   6          FALSE   1         0.63264514  0.372320610 2.517104e-01
## 7     1   7          FALSE   1         0.35434259 -0.199008666 7.191358e-02
## 8     1   8          FALSE   1         0.78596152 -0.298703707 1.620124e-01
## 9     1   9          FALSE   0         0.58348391  0.153758404 4.292839e-02
## 10    1  10          FALSE   1        -0.02478214 -0.766503517 1.066830e+00
## 11    1  11          FALSE   1         0.05020957 -0.517265913 4.858415e-01
## 12    1  12          FALSE   1        -0.48306478  0.291797692 1.546075e-01
## 13    1  13          FALSE   0         0.81095875  0.303300966 1.670377e-01
## 14    1  14          FALSE   0        -0.63388143 -1.069423066 2.076663e+00
## 15    1  15          FALSE   1         0.43016754 -0.728159270 9.627637e-01
## 16    1  16          FALSE   1        -0.48306478  0.069401061 8.745792e-03
## 17    1  17          FALSE   0        -0.38057612  0.180599377 5.922423e-02
## 18    1  18          FALSE   1        -0.91218398  0.556372995 5.620811e-01
## 19    1  19          FALSE   0        -0.71053962 -0.789510065 1.131833e+00
## 20    1  20          FALSE   0        -0.38057612 -0.547941311 5.451738e-01
## 21    1  21          FALSE   0        -0.38057612 -0.490424941 4.367290e-01
## 22    1  22          FALSE   1        -0.63388143 -0.241187337 1.056272e-01
## 23    1  23          FALSE   1        -0.43140383  0.391492734 2.783007e-01
## 24    1  24          FALSE   1        -0.10144033  0.069401061 8.745792e-03
## 25    1  25          FALSE   1        -0.60805096 -0.356220077 2.304113e-01
## 26    1  26           TRUE   0         0.78596152  0.395327158 2.837790e-01
## 27    1  27           TRUE   0         0.05020957 -0.268028310 1.304452e-01
## 28    1  28           TRUE   0         0.37933983  0.211274774 8.105172e-02
## 29    1  29           TRUE   1         0.53265620  0.069401061 8.745792e-03
## 30    1  30          FALSE   0        -0.60805096 -0.517265913 4.858415e-01
## 31    1  31          FALSE   1        -0.02478214  0.495022200 4.449552e-01
## 32    1  32          FALSE   1         0.27935089 -0.513431489 4.786653e-01
## 33    1  33          FALSE   0         0.22852317 -0.080141501 1.166224e-02
## 34    1  34          FALSE   1        -0.48306478  0.330141939 1.979103e-01
## 35    1  35          FALSE   1         0.10103728 -0.797178914 1.153928e+00
## 36    1  36          FALSE   1        -0.50806202  0.138420705 3.479117e-02
## 37    1  37           TRUE   0         0.10103728  0.230446897 9.642922e-02
## 38    1  38          FALSE   1        -0.86302275  0.659902461 7.907267e-01
## 39    1  39          FALSE   0         0.02437909 -0.007287432 9.643084e-05
## 40    1  40          FALSE   1        -0.43140383 -0.041797254 3.172214e-03
## 41    1  41          FALSE   0        -0.45723431  0.287963268 1.505709e-01
## 42    1  42          FALSE   0         0.37933983 -0.682146174 8.449321e-01
## 43    1  43           TRUE   1         0.27935089  0.241950172 1.062965e-01
## 44    1  44          FALSE   0        -0.93801446  0.832451571 1.258302e+00
## 45    1  45          FALSE   1         0.20269270  0.636895913 7.365527e-01
##         height       weight         bmi         fev1          fvc         fivc
## 1   0.77713507  0.165919925 -0.20645411  0.156279342  0.163720877  0.272825812
## 2  -0.97812521 -1.005879473 -0.80194940 -0.909744759 -0.915802485 -0.934576628
## 3   1.14946300  0.002413033 -0.51369338  2.007725875  1.673997111  1.705962180
## 4  -0.12708992 -0.161093860 -0.10928358 -0.319694518  0.090728194 -0.093431222
## 5   0.24523801  0.056915330 -0.05860811 -0.159725342 -0.399091125 -0.557356799
## 6  -0.60579727 -0.324600753 -0.04892677 -0.629143089 -0.528749180 -0.549882165
## 7   0.35161742  0.520184860  0.37593013  0.371975486  0.440804943  0.326144864
## 8   1.09627330  0.165919925 -0.34572878  0.763375479  0.793282767  0.576794235
## 9  -0.18027963 -0.842372581 -0.89040684 -0.103998375 -0.551799501 -0.197577780
## 10  0.13885860 -0.188345009 -0.27365567 -0.027291844  0.126744321  0.009718718
## 11  0.08566890 -0.351851902 -0.43100126  0.309036794  0.630009661  0.560350042
## 12  0.29842772  0.228597568  0.09730082 -0.286913949  0.109936795  0.185123448
## 13 -0.49941786 -1.005879473 -0.98127307 -0.081051977 -0.272794575 -0.413345529
## 14  0.13885860 -0.215596158 -0.30377607 -0.049582630  0.195415069  0.043105414
## 15 -0.02071051 -0.488107646 -0.53906847  0.232985874  0.451849889  0.335114424
## 16  0.67075566  0.165919925 -0.15688824  0.910232427  1.129433280  0.641076082
## 17 -0.49941786 -0.597112241 -0.45776172  0.183815020 -0.072544912 -0.144258729
## 18 -0.55260756 -0.444505808 -0.23466649 -0.581283458 -0.128249854 -0.230466167
## 19 -1.08450462 -0.242847307  0.37249022 -0.400334718 -0.375080374 -0.557855108
## 20 -0.18027963  0.629189455  0.85804109 -0.206929361  0.073920669 -0.312687134
## 21 -0.55260756 -0.515358795 -0.32657554 -0.430492841 -0.350109193 -0.484105392
## 22 -0.18027963  0.111417628  0.24284645  0.223151703  0.107535720  0.233957719
## 23  0.45799683  0.084166479 -0.13688102 -0.617342084 -0.063420827 -0.220499989
## 24  0.93670418  0.874449794  0.38285625  0.674212331  0.647297401  0.701869767
## 25  0.19204831  0.520184860  0.47520314  0.479495752  0.527243647  0.708846091
## 26 -0.23346933 -0.523534139 -0.48803949 -0.698637895 -0.502817569 -0.623631881
## 27 -0.87174580 -0.624363390 -0.30932492  0.182503798 -0.163305550 -0.124824682
## 28 -0.49941786  0.111417628  0.44965796 -0.170215124 -0.558042296 -0.578285772
## 29  0.77713507 -0.542609944 -0.88918625 -0.375421485  0.356287100  0.401389506
## 30 -0.07390022  0.029664181  0.08407622  0.196927248 -0.073985557 -0.205052413
## 31 -0.28665904  0.683691753  1.00617027  0.090062593 -0.207485332 -0.093929531
## 32  0.03247919  0.383929116  0.42564328  0.360174481 -0.007715884  0.206052421
## 33 -0.49941786 -0.569861092 -0.42286096  0.185126243 -0.144577165 -0.214520282
## 34  0.35161742 -0.079340414 -0.25643021 -0.454750462 -0.206524902 -0.517990396
## 35  0.61756595  1.146961283  0.84597829  1.469468933  1.129913495  1.200178657
## 36 -0.02071051  0.394829575  0.47242477 -0.499987647  0.202618294  0.081973507
## 37 -0.97812521 -0.678865688 -0.33065534 -0.536701885 -0.710750672 -0.768639768
## 38 -0.76536638 -0.016662772  0.46603235 -0.296748120 -0.395729620 -0.509020836
## 39 -0.18027963  0.193171074  0.33998244  0.225774148 -0.034607925  0.095427847
## 40  0.72394536  0.411180265  0.05708585  0.543745667  0.665065357  0.651540569
## 41 -0.44622815 -0.678865688 -0.58580432 -0.405579609 -0.113843404 -0.246910360
## 42 -0.28665904 -0.929576257 -0.95856530 -0.138090166 -0.146498025 -0.096421075
## 43  0.35161742  0.383929116  0.23221187 -0.338051637  0.554615903  0.625628506
## 44 -0.23346933 -0.351851902 -0.28152779 -0.534079439 -0.647842504 -0.868301546
## 45  0.19204831 -0.569861092 -0.71548026 -0.003034223  0.026859597  0.037624016
##       fev1_fvc   fev1_fev6         pef          mmef         dlco       spo2
## 1   0.04720582  0.15506913  0.48237335 -0.1219518394 -0.002394092  0.3629031
## 2  -0.87274235 -1.18527843 -0.97589525 -0.5764671571 -0.708260510  0.1248577
## 3   0.58469069  0.52376884  1.27295837  2.3907947748  0.651794114 -0.5420941
## 4  -0.65270566 -0.55771594 -0.51054672 -0.4421785405 -0.266580839  0.1248577
## 5   0.37524963  0.42702641  0.47310610 -0.0823539140 -0.314878233  0.5641677
## 6  -0.58616610 -0.73844318 -0.59880628 -0.4766115191 -0.762112105  0.5641677
## 7   0.03833451  0.01767715  0.27628727  0.1672851809  0.430833535 -0.1664863
## 8   0.16611891  0.38539336  0.71912962  0.1913882659  0.557855682 -0.5420941
## 9   0.93044872  0.89767208 -0.17560168  0.5968365891 -0.271652065  0.1248577
## 10 -0.20513937 -0.27047002  0.52848897 -0.2063126370  0.412239038  0.3629031
## 11 -0.22943267 -0.37868056 -0.15530198  0.1431820959  0.371910714 -0.5420941
## 12 -0.61554005 -0.58719138 -0.12066010 -0.3690084610 -0.263200021  0.7385109
## 13  0.29223467  0.25842665 -0.07101410 -0.0005755898 -0.567715092 -0.1664863
## 14 -0.31654596 -0.46122847 -0.50017622 -0.1503590468  0.008472822  0.7385109
## 15 -0.16352702 -0.14858075  0.33851027 -0.0272611483  0.686085264  0.1248577
## 16  0.02662141 -0.01565843  0.64101991  0.4220892226  0.601806311  0.1248577
## 17  0.45271677  0.45681677  0.33056690  0.2946872018 -0.351825740  0.1248577
## 18 -0.93213525 -0.87163370 -0.62682869 -0.4912455351 -0.302803885  0.8922926
## 19 -0.24861884 -0.35813955 -0.34506004 -0.3827816524  0.125352516 -0.5420941
## 20 -0.44627565 -0.43562966 -0.38521814 -0.3423229026  0.085990140  0.1248577
## 21 -0.35868347 -0.35429129 -0.38168776 -0.4809156415 -0.275032883  0.5641677
## 22  0.23288579  0.17477904 -0.03593092  0.1793367234  0.085024192 -0.1664863
## 23 -1.05129166 -0.71181403 -0.63697854 -0.5988485932 -0.620359252  0.3629031
## 24  0.20882590  0.27916749  0.61476269  0.3213727602  0.499657322 -0.1664863
## 25  0.08896125  0.11835396  0.36565008  0.1845016702  0.258653324 -0.5420941
## 26 -0.81151316 -0.88082532 -0.71552955 -0.5394517051 -0.824174256  1.4691649
## 27  0.62299960  0.66580981  0.11587552  0.4995634245 -0.205484635 -0.5420941
## 28  0.76267115  0.73610660  0.16530088  0.4100376801 -0.222388723  0.7385109
## 29 -0.93814718 -0.97773626 -0.31615503 -0.4129105087 -0.543807882  0.1248577
## 30  0.47982064  0.45122546  0.52804767  0.2137697020  0.185482772 -0.5420941
## 31  0.52501436  0.54463991  0.67058686  0.2111872286  0.001711186  0.3629031
## 32  0.65738850  0.69075053  0.91219741  0.6708674931  0.016200405 -0.1664863
## 33  0.59061830  0.49589334  0.09425193  0.5400221744 -0.050691486  0.1248577
## 34 -0.60104609 -0.53939909 -0.33049721 -0.5170702690 -0.705121179  0.1248577
## 35  0.59272538  0.60669033  0.80871308  1.8725784466  1.194415339 -0.5420941
## 36 -1.02945024 -0.95056188 -0.42625883 -0.5076011999 -0.752935600  0.3629031
## 37  0.07477897  0.05913110 -0.52577149 -0.4060239130 -0.381045663 -0.1664863
## 38  0.03636683  0.04534656 -0.62682869 -0.2700136475 -0.260060691 -0.5420941
## 39  0.46319913  0.43527815  0.20038405  0.3153469889 -0.264890430 -0.1664863
## 40  0.02936738  0.22889585  0.66793908 -0.0470601110  0.806104289 -0.5420941
## 41 -0.60594590 -0.52222808 -0.61138326 -0.4542300830 -0.759938722  0.1248577
## 42 -0.04278908 -0.13548838 -0.06924891 -0.0487817599 -0.025335354 -0.1664863
## 43 -1.00061940 -0.93562076 -0.13279579 -0.4671424500 -0.311497416  0.3629031
## 44 -0.07729268 -0.04712651 -0.73009237 -0.3096115729 -0.917146740 -0.5420941
## 45 -0.03941545 -0.23448141  0.45148250 -0.1193693660  0.104584636  0.1248577
##    smoking_status mmrc hosp_5_year
## 1               2    2       FALSE
## 2               0    1        TRUE
## 3               1    1       FALSE
## 4               1    1       FALSE
## 5               2    1       FALSE
## 6               1    2       FALSE
## 7               1    3        TRUE
## 8               1    0       FALSE
## 9               0    2        TRUE
## 10              1    1        TRUE
## 11              1    0       FALSE
## 12              1    3        TRUE
## 13              0    2       FALSE
## 14              1    3       FALSE
## 15              0    0       FALSE
## 16              2    1       FALSE
## 17              1    1       FALSE
## 18              1    2        TRUE
## 19              0    4       FALSE
## 20              1    1       FALSE
## 21              1    1        TRUE
## 22              1    2       FALSE
## 23              1    4       FALSE
## 24              1    1       FALSE
## 25              1    4       FALSE
## 26              2    2        TRUE
## 27              1    1       FALSE
## 28              1    3        TRUE
## 29              2    3        TRUE
## 30              0    0       FALSE
## 31              2    1       FALSE
## 32              2    0        TRUE
## 33              1    1       FALSE
## 34              1    3        TRUE
## 35              1    2       FALSE
## 36              1    0       FALSE
## 37              1    1       FALSE
## 38              1    4       FALSE
## 39              1    1       FALSE
## 40              1    0       FALSE
## 41              1    1        TRUE
## 42              1    3       FALSE
## 43              1    0       FALSE
## 44              1    1       FALSE
## 45              1    2       FALSE
##  [ reached 'max' / getOption("max.print") -- omitted 45395 rows ]
#We can quickly fit models to this and assess normality of residuals for predictors
modelFit1 <- glm(data=imp.comp%>%filter(.imp==1), formula=hosp_5_year ~ hosp_past_year + sex + age + height + weight + bmi + fev1 + fvc + fev1_fvc + exp(spo2+101) + mmef + pef + dlco + mmrc + smoking_status, family="binomial")
summary(modelFit1)
## 
## Call:
## glm(formula = hosp_5_year ~ hosp_past_year + sex + age + height + 
##     weight + bmi + fev1 + fvc + fev1_fvc + exp(spo2 + 101) + 
##     mmef + pef + dlco + mmrc + smoking_status, family = "binomial", 
##     data = imp.comp %>% filter(.imp == 1))
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.250e+00  2.232e-01 -14.559  < 2e-16 ***
## hosp_past_yearTRUE  1.320e+00  1.283e-01  10.288  < 2e-16 ***
## sex1                1.770e-01  1.356e-01   1.305  0.19180    
## age                -9.468e-02  1.180e-01  -0.802  0.42249    
## height              2.915e-01  4.547e-01   0.641  0.52145    
## weight             -4.497e-01  9.282e-01  -0.484  0.62805    
## bmi                 7.291e-01  7.911e-01   0.922  0.35675    
## fev1               -5.861e-01  6.021e-01  -0.974  0.33030    
## fvc                -4.362e-01  4.112e-01  -1.061  0.28879    
## fev1_fvc           -8.905e-01  3.017e-01  -2.952  0.00316 ** 
## exp(spo2 + 101)     1.381e-45  9.033e-46   1.529  0.12620    
## mmef                1.072e-03  2.601e-01   0.004  0.99671    
## pef                -5.288e-02  2.412e-01  -0.219  0.82643    
## dlco               -7.040e-01  1.671e-01  -4.213 2.52e-05 ***
## mmrc1               2.569e-01  1.570e-01   1.636  0.10187    
## mmrc2               3.476e-01  1.665e-01   2.087  0.03688 *  
## mmrc3               4.514e-01  1.682e-01   2.683  0.00729 ** 
## mmrc4               4.911e-01  1.742e-01   2.818  0.00483 ** 
## smoking_status1     4.521e-01  1.652e-01   2.737  0.00620 ** 
## smoking_status2     9.166e-01  1.823e-01   5.029 4.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3945.5  on 4543  degrees of freedom
## Residual deviance: 3236.7  on 4524  degrees of freedom
## AIC: 3276.7
## 
## Number of Fisher Scoring iterations: 6
plot(modelFit1)

anova(modelFit1, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: hosp_5_year
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             4543     3945.5              
## hosp_past_year   1   196.67      4542     3748.8 < 2.2e-16 ***
## sex              1     4.44      4541     3744.4 0.0350861 *  
## age              1    20.99      4540     3723.4 4.607e-06 ***
## height           1    15.24      4539     3708.1 9.479e-05 ***
## weight           1     6.17      4538     3702.0 0.0130285 *  
## bmi              1     1.69      4537     3700.3 0.1929467    
## fev1             1   354.29      4536     3346.0 < 2.2e-16 ***
## fvc              1    16.78      4535     3329.2 4.188e-05 ***
## fev1_fvc         1    14.29      4534     3314.9 0.0001567 ***
## exp(spo2 + 101)  1     4.50      4533     3310.4 0.0339895 *  
## mmef             1     0.00      4532     3310.4 0.9540787    
## pef              1     1.05      4531     3309.4 0.3065100    
## dlco             1    32.71      4530     3276.6 1.071e-08 ***
## mmrc             4     9.97      4526     3266.7 0.0409319 *  
## smoking_status   2    30.00      4524     3236.7 3.061e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
res <- residuals(modelFit1, type="deviance")
plot(log(predict(modelFit1)), res)
## Warning in log(predict(modelFit1)): NaNs produced
abline(h=0, lty=2)

qqnorm(res)
qqline(res)

# Tasks to do
# 
# Create and save imputed datasets
# Create model with no feature selection, run prediction on test set.
# Create model with feature selection, regularisation, etc. on test set.
# Create XGBoost model on test set.
# Compare performances for all
means[["spo2"]]
## [1] 1.513808